In [1]:
# Settings
# See "./sample input files" for an example - Solr/Lucene stop word file format (one word per line, plain text)
STOP_WORDS_FILE = "/Users/simon.hughes/Software/Solr/solr-5.1.0/server/solr/DiceJobs/conf/dice_stop_words.txt"
# Keywords generated in step 2, or configured manually from your search logs.
# These files are expected to be in Solr/Lucene/ES synonym file format (a small illustration follows this cell).
# The Python code below mimics the Solr analysis chain logic, and can be used to filter the text down to just
# the words and phrases listed in the keyword files. See "./sample input files" for examples.
# Only one file is needed; which keywords/phrases matter depends on your domain.
KEY_WORD_FILES = ["/Users/simon.hughes/Documents/Dice Data/LuceneTalk/Phrases.txt",
                  # the file above was generated in the previous step; the file below came from our search log analysis
                  "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/top_5k_keywords.txt"]
DOCS_FOLDER = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/ProcessedDocs"
MODEL_FILE = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/keyword_model.w2v"
FILE_MASK = r".*\.txt"
MIN_SENT_LENGTH = 5
# word2vec settings
MIN_WD_COUNT = 10  # for the word2vec model; setting this to 10 seems to remove some of the noise
WINDOW_SIZE = 5
VECTOR_SIZE = 300
WORKERS = 8
TRAINING_ITERATIONS = 15
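For reference, the keyword/synonym files are assumed to look roughly like the lines below (these entries are invented for illustration; see "./sample input files" for real examples). Blank lines and lines starting with "#" are skipped by build_synonym_filter (defined below).

java developer,java engineer => java developer
sql server
c#,c sharp

A line containing "=>" maps every comma-separated term on the left to the set of terms on the right; a line without "=>" treats the comma-separated terms as equivalent (each expands to the full set); a single phrase on its own simply maps to itself, which whitelists that phrase.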
In [2]:
# Shared
import os
import re
from collections import defaultdict

def load_stop_words(stop_words_file):
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # skip blank lines and comment lines
            if word and word[0] != "#":
                word = word.lower()
                stop_words.add(word)
    return stop_words

re_collapse_spaces = re.compile(r"\s+")
def collapse_spaces(s):
    return re_collapse_spaces.sub(" ", s)

re1 = re.compile(r"[;:\'\"\*/\),\(\|\s]+")
def clean_str(s):
    s = str(s).replace("'s", " ")
    # hyphens and backslashes are handled separately - awkward to include in the regex above
    s = s.replace("-", " ").replace("\\", " ")
    s = re1.sub(" ", s).strip()
    return collapse_spaces(s)

def find_files(folder, regex, remove_empty=False):
    """
    Find all files matching the [regex] pattern in [folder]

    folder : string
        folder to search (not recursive)
    regex : string (NOT a compiled regex object)
        pattern to match
    """
    files = os.listdir(folder)
    matches = [os.path.abspath(os.path.join(folder, f))
               for f in files
               if re.search(regex, f, re.IGNORECASE)]
    if remove_empty:
        matches = [f for f in matches if os.path.getsize(f) > 0]
    matches.sort()
    return matches
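A quick sanity check of the cleaning helper (the input string here is made up; the output in the comment is what clean_str returns for it):

In [ ]:
print(clean_str("Sr. Java/J2EE Developer - C#, .NET"))
# -> Sr. Java J2EE Developer C# .NET
# slashes, commas, pipes and hyphens become spaces; periods are only trimmed from token ends later,
# by remove_punct_at_end_filter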
In [3]:
from collections import defaultdict

class SynonymMapper(object):
    def __init__(self, mapper, nested, case_sensitive=False):
        self.case_sensitive = case_sensitive
        self.mapper = mapper
        self.nested = nested
        self.synonyms = set()
        for rhs in self.mapper.values():
            for syn in rhs:
                self.synonyms.add(syn)

    def is_synonym(self, term):
        return term in self.synonyms

    def map_synonyms(self, tokens, debug=False):
        mapped = []
        size = len(tokens)
        if not self.case_sensitive:
            tmp_tokens = [s.lower() for s in tokens]
        else:
            tmp_tokens = tokens
        ix = 0
        while ix < size:
            if debug:
                print "ix", ix
            best, best_key = None, None
            tmp_ix = ix
            max_ix = ix
            current = ""
            d = self.nested
            # greedily extend the match one token at a time, keeping the longest phrase found in the mapper
            while tmp_ix < size and tmp_tokens[tmp_ix] in d:
                current += tmp_tokens[tmp_ix] + " "
                key = current.strip()
                if key in self.mapper:
                    if debug:
                        if best is not None:
                            print(ix, tmp_ix, "new best:", key, "=>", self.mapper[key])
                        else:
                            print(ix, tmp_ix, "best:", key, "=>", self.mapper[key])
                    best = self.mapper[key]
                    best_key = key
                    max_ix = tmp_ix
                d = d[tmp_tokens[tmp_ix]]
                tmp_ix += 1
            if best is None:
                # no phrase matched - retain the original casing
                mapped.append(tokens[ix])
            else:
                # skip past the matched phrase
                ix = max_ix
                # mapper values are sets, so emit them in a stable order
                for item in sorted(best):
                    mapped.append(item)
            ix += 1
        return mapped

    def __repr__(self):
        return "Synonym Mapper: %i synonyms mapped" % len(self.mapper)

def build_synonym_filter(files, case_sensitive=False):
    mapper = defaultdict(set)
    # recursively defined defaultdict - used to build a nested map (a trie) over phrase tokens
    def dd():
        return defaultdict(dd)
    nested_map = defaultdict(dd)
    file_locn = dict()

    if type(files) == str:
        files = [files]
    for f in files:
        with open(f, "r+") as fin:
            for line in fin:
                line = line.strip()
                if len(line) > 0 and not line[0] == "#":
                    # normalise case up-front when the mapper is case-insensitive
                    if not case_sensitive:
                        line = line.lower()
                    if "=>" in line:
                        left, right = line.split("=>")
                        right = set(p.strip() for p in right.split(","))
                        left_parts = [p.strip() for p in left.split(",")]
                    else:
                        left_parts = [p.strip() for p in line.split(",")]
                        right = set(left_parts)
                    for syn in left_parts:
                        for rhs in right:
                            mapper[syn].add(rhs)
                        file_locn[syn] = f
                        # add the phrase's tokens to the nested lookup so multi-word matches can be found
                        tokens = syn.split(" ")
                        prev = tokens[0]
                        d = nested_map[prev]
                        for token in tokens[1:]:
                            d = d[token]
                            prev = token
    return SynonymMapper(mapper, nested_map, case_sensitive)
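To make the longest-match behaviour concrete, here is a minimal, self-contained sketch; the synonym entries and the temp file are invented for illustration and are not part of the pipeline.

In [ ]:
# write a tiny, made-up synonym file and run the mapper over some tokens
import tempfile

tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False)
tmp.write("java developer,java engineer => java developer\n")
tmp.write("sql server\n")
tmp.close()

demo_mapper = build_synonym_filter([tmp.name], False)
os.remove(tmp.name)
print(demo_mapper.map_synonyms(["senior", "java", "engineer", "with", "sql", "server"]))
# expected: ['senior', 'java developer', 'with', 'sql server']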
In [4]:
# String processing
def white_space_tokenize(s):
    return s.split(" ")

__punct__ = set(".?!,;:")
def remove_punct_at_end(s):
    while len(s) > 1 and s[-1] in __punct__:
        s = s[:-1]
    return s

# Token filters
def fact_len_filter(min_len):
    # factory: returns a filter that drops tokens shorter than min_len characters
    def len_filter(tokens):
        return [t for t in tokens if len(t) >= min_len]
    return len_filter

remove_empty_tokens_filter = fact_len_filter(1)

def lower_case_filter(tokens):
    if type(tokens) == str:
        return tokens.lower()
    return [t.lower() for t in tokens]

def remove_punct_at_end_filter(tokens):
    return [remove_punct_at_end(t) for t in tokens]

def fact_is_synonym_filter(syn_mapper):
    # factory: returns a filter that keeps only tokens/phrases known to the synonym mapper
    def is_synonym_filter(tokens):
        return [t for t in tokens if syn_mapper.is_synonym(t)]
    return is_synonym_filter
In [5]:
def fact_stop_word_filter(case_sensitive, stop_words_file):
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # skip blank lines and comment lines
            if word and word[0] != "#":
                if not case_sensitive:
                    word = word.lower()
                stop_words.add(word)

    def cs_stop_filter(tokens):
        return [tok for tok in tokens if tok not in stop_words]

    def stop_filter(tokens):
        return [tok for tok in tokens if tok.lower() not in stop_words]

    if case_sensitive:
        return cs_stop_filter
    else:
        return stop_filter

stop_filter = fact_stop_word_filter(False, STOP_WORDS_FILE)
In [6]:
def analyze(s, filters):
    temp = s
    for f in filters:
        temp = f(temp)
    return temp

def debug_analyze(s, filters):
    # run the analysis chain, printing the output of each filter as it is applied
    temp = s
    pad = 20
    print "START".ljust(pad), temp
    for f in filters:
        temp = f(temp)
        if type(temp) == list:
            s_temp = "|".join(map(str, temp))
        else:
            s_temp = str(temp)
        print f.func_name.ljust(pad), s_temp
    return temp
In [7]:
syn_mapper = build_synonym_filter(KEY_WORD_FILES, False)
syn_mapper
Out[7]:
In [8]:
# Skills from text
is_a_synonym_filter = fact_is_synonym_filter(syn_mapper)
analysis_chain = [clean_str,
                  white_space_tokenize,
                  remove_punct_at_end_filter,
                  lower_case_filter,
                  stop_filter,
                  syn_mapper.map_synonyms,
                  remove_empty_tokens_filter]
                  # is_a_synonym_filter]  # un-comment to train on just the keywords.
                  # It is usually best to train on all words and then filter the learned synonyms down to the
                  # keywords afterwards (see the sketch at the end of this notebook).

# Test
rslt = debug_analyze("$150k as400 Sr.\ Java/j2ee and the C#.! developer. FIT \"HOT\" dev. -IBM's business, sql server management",
                     analysis_chain)
In [9]:
import os, re, time
start = time.time()
sentences = []
files = find_files(DOCS_FOLDER, FILE_MASK, True)
print("%s files found in %s" % (len(files), DOCS_FOLDER))
documents = []
for i, fname in enumerate(files):
    with open(fname) as f:
        contents = f.read()
        sentences.extend(contents.split("\n"))
end = time.time()
print("Loading %i sentences took %s seconds" % (len(sentences), str(end - start)))
In [10]:
print len(sentences)
tokenized = []
print("Tokenizing sentences")
for i, sent in enumerate(sentences):
    tokens = analyze(sent, analysis_chain)
    if len(tokens) >= MIN_SENT_LENGTH:
        tokenized.append(tokens)
    if i % 100000 == 0:
        print(i)
In [ ]:
import gensim, time
from gensim.models.word2vec import Word2Vec

start = time.time()
print("Training Model. This could take a while (10-60 mins for moderate collections). Get a coffee.")
# sample=1e-5 aggressively down-samples very frequent words; hs=0 with negative=20 means
# negative sampling (20 noise words) is used instead of hierarchical softmax
model = Word2Vec(tokenized, iter=TRAINING_ITERATIONS, size=VECTOR_SIZE, window=WINDOW_SIZE,
                 min_count=MIN_WD_COUNT, workers=WORKERS, sample=1e-5, hs=0, negative=20)
model.save(MODEL_FILE)
end = time.time()
print "Took %s seconds" % (end - start)
In [ ]:
# find the top n most similar terms to a keyword, e.g.:
#   model.most_similar(positive="hadoop developer", topn=10)
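As noted above, the model is trained on all words and the learned similarities are then filtered down to the keywords. A rough sketch of that filtering step follows; it is not part of the original notebook, and the location of the vocabulary attribute depends on your gensim version.

In [ ]:
# keep only vocabulary terms that are known keywords/phrases, then list candidate synonyms for each
vocab_terms = getattr(model, "wv", model).index2word  # model.index2word on older gensim, model.wv.index2word on newer
keyword_terms = [t for t in vocab_terms if syn_mapper.is_synonym(t)]
print("%i of %i vocabulary terms are keywords" % (len(keyword_terms), len(vocab_terms)))
for term in keyword_terms[:10]:
    print("%s => %s" % (term, model.most_similar(positive=[term], topn=5)))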